Introduction

This document is a walkthrough of how to use the code for the Turnout Tracker. For a discussion of the math and the model, see Turnout Tracker Math

Before Election Day: Fitting the Model

There are a number of parameters that need to be fit on historical data: baseline turnout rates, precinct covariances, etc.

Each election should have a config, which I’ve created in config.R. config is a list with the following items:

library(tidyverse)

source("config.R")
print(config)
## $city
## [1] "Chicago"
## 
## $city_filename
## [1] "chicago"
## 
## $timezone
## [1] "America/Chicago"
## 
## $election_ds
## [1] "2019-04-02"
## 
## $start_hour
## [1] 6
## 
## $end_hour
## [1] 19
## 
## $precinct_shp_path
## [1] "data/Precincts (current).shp"
## 
## $get_precinct_id
## function (df) 
## sprintf("%02d%02d", df$ward, df$precinct)
## 
## $get_ward_from_precinct
## function (precinct) 
## substr(precinct, 1, 2)
## 
## $turnout_df_path
## [1] "data/turnout_df.csv"
## 
## $submission_bitly
## [1] "http://bit.ly/chicagoturnout"
## 
## $google_doc
## [1] "https://docs.google.com/spreadsheets/d/1W8Y6kC4OctVv2xqLaixo_47-c0EbX8rGY-M6XIPHgIM"
## 
## $ref_turnout
##       March 2015 Runoff 
##      452529      592524 
## 
## $site_name
## [1] "Fifty Wards"
## 
## $precinct_name
## [1] "precinct"
## 
## $ward_name
## [1] "ward"
## 
## $use_real_data
## [1] FALSE

All of the prep work is done in calc_params. The input is a dataframe, turnout_df, which has columns precinct, year, turnout. Precinct is the unique identifier for the precinct, year is the date of the election (this doesn’t have to be a year, but can be any identifier, e.g. “2018-03”), and turnout is the voter count. Internally, we’ll validate that there is a turnout value for every precinct in every year, so you will need to crosswalk turnout if boundaries have changed.

df <- read_csv(config$turnout_df_path)
head(arrange(df, precinct, year))
## # A tibble: 6 x 3
##   precinct    year turnout
##      <chr>   <chr>   <int>
## 1     0101 2014-11     299
## 2     0101 2015-02     196
## 3     0101 2015-04     259
## 4     0101 2016-03     455
## 5     0101 2016-11     637
## 6     0101 2018-11     594

We can now calculate the historic modelParams:

source("../calc_params.R", chdir=TRUE)

params <- calc_params(
  turnout_df=df, 
  n_svd=3
) 
## [1] "Fitting fixed effects"
## [1] "Calculating svd"
## [1] "Fitted vs True values, check for similarity:"
## [1] "Fitted:"
##             [,1]       [,2]        [,3]        [,4]      [,5]      [,6]
## [1,] -0.11038728 -0.2708403 -0.19458832  0.15395375 0.1340909 0.1899450
## [2,] -0.04237813 -0.1945788 -0.12367409  0.17382635 0.1650317 0.0943869
## [3,] -0.11824162 -0.2850754 -0.17629177  0.06149841 0.1655323 0.2650579
## [4,] -0.09362602 -0.2108742 -0.11321424 -0.03062924 0.1300497 0.2455194
## [5,] -0.15771549 -0.3839398 -0.22756622  0.05512426 0.2346070 0.3748582
## [6,] -0.04085355 -0.1467579 -0.07300951  0.03622818 0.1310929 0.1328661
## [1] "True:"
##       2014-11    2015-02     2015-04     2016-03    2016-11   2018-11
## 1 -0.15274621 -0.2470071 -0.17840995  0.18554817 0.08712289 0.2401797
## 2 -0.15628468 -0.1428391 -0.08857101  0.21494522 0.13829907 0.1408920
## 3 -0.14804698 -0.2380893 -0.20984024  0.06231033 0.17968719 0.2731712
## 4 -0.15230519 -0.1295680 -0.16614874 -0.02804357 0.15613147 0.2584082
## 5 -0.09414886 -0.3471843 -0.31906444  0.03612364 0.23799685 0.3854325
## 6  0.02638586 -0.1663075 -0.09093671  0.04177403 0.07904893 0.1668092
## [1] "Calculating covariances"
## params has a copy of turnout_df, with some new columns.
print(head(params@turnout_df))
## # A tibble: 6 x 5
##   precinct    year turnout log_turnout precinct_num
##     <fctr>   <chr>   <int>       <dbl>        <dbl>
## 1     0101 2014-11     299    5.703782            1
## 2     0101 2015-02     196    5.283204            1
## 3     0101 2015-04     259    5.560682            1
## 4     0101 2016-03     455    6.122493            1
## 5     0101 2016-11     637    6.458338            1
## 6     0101 2018-11     594    6.388561            1
## params has an estimate of the year_fe, on the log scale.
print(head(params@year_fe))
## # A tibble: 6 x 2
##      year  year_fe
##     <chr>    <dbl>
## 1 2014-11 5.735261
## 2 2015-02 5.408943
## 3 2015-04 5.617823
## 4 2016-03 5.815676
## 5 2016-11 6.249947
## 6 2018-11 6.027114
## params has an estimate of the precinct_fe, on the log scale.
print(head(params@precinct_fe))
## # A tibble: 6 x 2
##   precinct precinct_fe
##     <fctr>       <dbl>
## 1     0101  0.12126818
## 2     0102  0.01201116
## 3     0103  0.19043880
## 4     0104  0.17594645
## 5     0105  0.08573610
## 6     0106  0.14643658
## params has the svd results, which is used for the covariance (more on this later).
print(head(params@svd$u))
##             [,1]         [,2]         [,3]
## [1,] -0.03672277  0.016845021 -0.019538640
## [2,] -0.02710109  0.019408106  0.010315728
## [3,] -0.04147465  0.003713562 -0.018337205
## [4,] -0.03249335 -0.007669779 -0.015206827
## [5,] -0.05678227  0.001050900 -0.022566483
## [6,] -0.02261681  0.001705908  0.004857801
print(head(params@svd$v))
##             [,1]         [,2]        [,3]
## [1,]  0.19044875 -0.003779145  0.22946596
## [2,]  0.57853435 -0.141128508  0.08072853
## [3,]  0.29868500 -0.285404538  0.21703637
## [4,] -0.08600731  0.856518268  0.04917774
## [5,] -0.46396848 -0.029105537  0.39591432
## [6,] -0.56145802 -0.405066149 -0.12432520
## params has the estimated covariance matrix among precincts (and its inverse)
print(params@precinct_cov[1:6, 1:6])
##            [,1]       [,2]       [,3]       [,4]       [,5]       [,6]
## [1,] 0.03857657 0.02352642 0.03585651 0.02599266 0.04784356 0.01716449
## [2,] 0.02352642 0.02455144 0.02315902 0.01550476 0.03093631 0.01377707
## [3,] 0.03585651 0.02315902 0.04269169 0.03037236 0.05316001 0.01920352
## [4,] 0.02599266 0.01550476 0.03037236 0.02836700 0.04166078 0.01478648
## [5,] 0.04784356 0.03093631 0.05316001 0.04166078 0.07572330 0.02630373
## [6,] 0.01716449 0.01377707 0.01920352 0.01478648 0.02630373 0.01449630

I also provide some helper functions to make diagnostic plots. These require an sf object with the precinct shapefiles. The sf must have a column precinct which matches the id column in turnout_df.

The diagnostics include plots of (a) the fixed effects by precinct and by year, and (b) the svd components for the estimated covariances, along with each dimension’s score in each year. You should sanity check that the combination of precincts and elections make sense.

library(sf)
divs <- safe_load("data/precincts.Rda")
head(divs)
## Simple feature collection with 6 features and 5 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -87.73835 ymin: 41.96853 xmax: -87.69203 ymax: 41.99045
## epsg (SRID):    4326
## proj4string:    +proj=longlat +ellps=WGS84 +no_defs
##   full_text precinct shape_area shape_len ward
## 1     39012     3912    2402920  8750.364   39
## 2     39039     3939    8055746 12090.673   39
## 3     40001     4001    1756462  6868.847   40
## 4     40002     4002    1999752  7439.578   40
## 5     40005     4005    1999124  7366.565   40
## 6     40006     4006    1238134  5314.833   40
##                         geometry
## 1 MULTIPOLYGON (((-87.7226330...
## 2 MULTIPOLYGON (((-87.7284329...
## 3 MULTIPOLYGON (((-87.6930399...
## 4 MULTIPOLYGON (((-87.6956910...
## 5 MULTIPOLYGON (((-87.6920299...
## 6 MULTIPOLYGON (((-87.6938729...
diagnostics(params, divs)
## [1] "Plotting Diagnostics..."

## Press <Enter> to continue...

## Press <Enter> to continue...

## Press <Enter> to continue...

## Press <Enter> to continue...

## Press <Enter> to continue...

## Press <Enter> to continue...

## Press <Enter> to continue...

## Press <Enter> to continue...

Let’s save the results and move on.

save_with_backup(params, stem="params", dir="outputs")

Testing on Fake Data

An important validation is to test the model on a fake, known turnout pattern. The function load_data will either load data from our google-form download (later), or create a fake dataset with an S-curve.

source("../fit_submissions.R", chdir=TRUE)

data_list <- load_data(use_real_data=FALSE, params=params, election_config=config)
raw_data <- data_list$raw_data
fake_data <- data_list$fake_data

print("True Turnout to be estimated:")
## [1] "True Turnout to be estimated:"
print(fake_data$true_turnout)
## [1] 595190.6
em_fit <- fit_em_model(
  raw_data, 
  params, 
  verbose=FALSE, 
  tol=1e-10, 
  use_inverse=FALSE, 
  election_config=config
)
## [1] "n_iter = 20"
fit <- process_results(
  em_fit$precinct_re_fit, 
  em_fit$loess_fit, 
  raw_data,
  em_fit$resid,
  params,
  election_config=config,
  plots = TRUE, 
  save_results = FALSE,
  fake_data=fake_data
)
## [1] "predicting loess"
## [1] "plots"

## Press <Enter> to continue...

## Press <Enter> to continue...

## Press <Enter> to continue...

## Press <Enter> to continue...
## [1] "div_turnout"
## [1] "time_df"
## [1] "full_predictions"
print("Estimate:")
## [1] "Estimate:"
fit$full_predictions %>% 
  filter(time_of_day == max(time_of_day)) %>%
  with(sum(prediction))
## [1] 569143.5

But we don’t want a single estimate, we want a bootstrap of estimates. This can take a few minutes…:

source("../bootstrap.R", chdir=TRUE)

bs <- fit_bootstrap(
  raw_data,
  params,
  election_config=config,
  n_boot=40,
  use_inverse=FALSE,
  verbose=FALSE
)
## [1] "Raw Result"
## [1] "n_iter = 20"
## [1] "n_iter = 28"
## [1] "n_iter = 29"
## [1] "n_iter = 28"
## [1] "n_iter = 28"
## [1] "n_iter = 31"
## [1] "n_iter = 29"
## [1] "n_iter = 37"
## [1] "n_iter = 28"
## [1] "n_iter = 25"
## [1] "n_iter = 30"
## [1] "n_iter = 25"
## [1] "n_iter = 27"
## [1] "n_iter = 28"
## [1] "n_iter = 30"
## [1] "n_iter = 27"
## [1] "n_iter = 27"
## [1] "n_iter = 29"
## [1] "n_iter = 32"
## [1] "n_iter = 29"
## [1] "n_iter = 27"
## [1] "n_iter = 26"
## [1] "n_iter = 28"
## [1] "n_iter = 29"
## [1] "n_iter = 27"
## [1] "n_iter = 27"
## [1] "n_iter = 28"
## [1] "n_iter = 29"
## [1] "n_iter = 26"
## [1] "n_iter = 27"
## [1] "n_iter = 26"
## [1] "n_iter = 26"
## [1] "n_iter = 30"
## [1] "n_iter = 26"
## [1] "n_iter = 27"
## [1] "n_iter = 30"
## [1] "n_iter = 28"
## [1] "n_iter = 27"
## [1] "n_iter = 21"
## [1] "n_iter = 27"
## [1] "n_iter = 27"
## [1] "BS Turnout: 546355 573813 590517"
gg_bs_hist <- hist_bootstrap(bs) 
print(gg_bs_hist)

gg_turnout <- turnout_plot(
  bs,
  raw_data,
  config
)
print(gg_turnout)

save_with_backup(
  bs,
  stem="bootstrap",
  dir="outputs"
)  

for(plotname in c("gg_turnout", "gg_bs_hist")){
  ggsave_with_backup(
    get(plotname), 
    filestem=plotname,
    plottype="png",
    width = 7,
    height=7,
    dir='outputs'
  )
}

On Election Day

There are a few specifics that need to be handled on election day. The file run_all.R does three things: (a) download the google data, (b) run election_tracker.Rmd, which recreates the code above, and then (c) push the html to github using upload_git.bat.